source("utils.R")Prepare DOC data
Read raw data, clean it and compute averages per pixel.
Read data
#|warning: false
# Read column names and types
# NB: file generated based on what we want to read in the dataset, this is much faster than reading the whole dataset and then selecting columns
columns <- read_delim(
"data/raw/0227166/columns.txt",
delim = "\t",
escape_double = FALSE,
trim_ws = TRUE,
show_col_types = FALSE
)
# Read with appropriate column names and types
suppressWarnings(df_doc_raw <- read_excel(
"data/raw/0227166/4.4/data/0-data/All_Basins_Data_Merged_Hansell_2022.xlsx",
skip = 2,
col_names = columns$name,
col_types = columns$type
))
# Select useful columns
df_doc <- df_doc_raw %>%
select(
bottle, bottle_flag_w,
lon = longitude, lat = latitude, date,
press = `ctd pressure`,
doc, doc_flag_w
)Clean
Remove aberrant doc observations (flag = 9, negative values…).
df_doc <- df_doc %>%
filter(!date %in% c(-999, 0)) %>% # missing data
filter(press != -999) %>% # missing depth
filter(!bottle_flag_w %in% c(-999, 9)) %>% # bottle flag is ok
filter(doc_flag_w != 9) %>% # flag is ok
filter(doc > 0) %>% # doc is positive
drop_na(doc) #%>% # drop missing values
# mutate(date = ymd(date)) # format date
summary(df_doc) bottle bottle_flag_w lon lat
Length:98189 Length:98189 Min. :-179.99 Min. :-78.643
Class :character Class :character 1st Qu.:-133.84 1st Qu.:-32.499
Mode :character Mode :character Median : -35.85 Median : 15.383
Mean : -28.34 Mean : 6.568
3rd Qu.: 65.12 3rd Qu.: 40.334
Max. : 213.60 Max. : 89.988
date press doc doc_flag_w
Length:98189 Min. : 0.0 Min. : 13.30 Length:98189
Class :character 1st Qu.: 89.9 1st Qu.: 40.33 Class :character
Mode :character Median : 399.5 Median : 45.20 Mode :character
Mean : 998.9 Mean : 52.00
3rd Qu.:1442.1 3rd Qu.: 56.75
Max. :6903.9 Max. :724.75
ggplot(df_doc) + geom_bar(aes(x = doc_flag_w))There is also a little cleaning to do with dates: the TRACERS (2013) cruise reported dates as 201303X , but after inspecting boat trajectory (available at https://www.bco-dmo.org/dataset/549090) and comparing with DOC sampling location, it appears that those dates should be 2013030X.
Moreover, some dates are YYYYMM00, let’s change them to YYYYMM01.
Let’s fix this.
df_doc <- df_doc %>%
mutate(
# Fix for Tracers 2013
date = ifelse( # if only 7 characters in date, one character is missing, add a 0 before the last character
nchar(date == 7),
paste0(str_extract(date, "^[[:digit:]]{6}"), "0", str_extract(date, "[[:digit:]]{1}$")),
date
),
# Fix for dates ending in 00
date = str_replace(date, "00$", "01"),
date = ymd(date) # format date
)Assign observations to layers
DOC observations are separated in 3 layers:
surface (depth ⩽ 10)
epi pelagic (depth > 10 & depth ⩽ 200)
mesopelagic (depth > 200 & depth ⩽ 1000)
surface (depth > 1000)
df_doc <- df_doc %>%
mutate(layer = case_when(
press <= surface_bottom ~ "surf",
press > surface_bottom & press <= meso_top ~ "epi",
press > meso_top & press <= meso_bottom ~ "meso",
press > meso_bottom ~ "bathy",
)) %>%
filter(!is.na(layer)) # drop observations not assigned to any layerAverage by pixel across the layers
Annual
df_doc_ann <- df_doc %>%
mutate(
# floor and add 0.5 because other env data are on the centre of each pixel
lon = roundp(lon, precision = 1, f = floor) + 0.5,
lat = roundp(lat, precision = 1, f = floor) + 0.5
) %>%
group_by(lon, lat, layer) %>%
summarise(
# compute mean and sd
doc_mean = mean(doc),
doc_sd = sd(doc),
n_obs = n(),
.groups = "drop"
) Let’s plot some maps.
# Number of observations
ggmap(
df_doc_ann,
"n_obs",
type = "point",
palette = ggplot2::scale_colour_viridis_c(trans = "log1p", direction = -1)
) +
ggtitle("Number of observations") +
facet_wrap(~layer, ncol = 2)# DOC mean
ggmap(
df_doc_ann,
"doc_mean",
type = "point",
palette = ggplot2::scale_colour_viridis_c(option = "F", trans = "log1p")
) +
ggtitle("DOC mean") +
facet_wrap(~layer, ncol = 2)# DOC sd
ggmap(
df_doc_ann,
"doc_sd",
type = "point",
palette = ggplot2::scale_colour_viridis_c(option = "E", trans = "log1p", na.value = NA)
) +
ggtitle("DOC sd") +
facet_wrap(~layer, ncol = 2)Store layered doc in multiple columns, in a similar fashion as for env data, and add season.
df_doc_ann <- df_doc_ann %>%
select(lon, lat, layer, mean = doc_mean, sd = doc_sd) %>%
pivot_longer(mean:sd) %>%
unite("layer", layer:name) %>%
pivot_wider(names_from = "layer", values_from = "value") %>%
mutate(season = 0, .after = lat)Seasonal (surface layer only)
Seasons are defined as:
season 1 = DJF
season 2 = MAM
season 3 = JJA
season 4 = SON
df_doc_s <- df_doc %>%
filter(layer == "surf") %>%
mutate(season = quarter(date, fiscal_start = 12)) %>%
# generate seasonal average by pixel
mutate(
# floor and add 0.5 because other env data are on the centre of each pixel
lon = roundp(lon, precision = 1, f = floor) + 0.5,
lat = roundp(lat, precision = 1, f = floor) + 0.5
) %>%
group_by(lon, lat, season) %>%
summarise(
# compute mean and sd
surf_mean = mean(doc),
surf_sd = sd(doc),
.groups = "drop"
)
# Counts per season
ggplot(df_doc_s) + geom_bar(aes(x = season)) + scale_x_continuous(n.breaks = 3, expand = c(0,0))And plot seasonal maps.
ggmap(df_doc_s, "surf_mean", type = "point", palette = ggplot2::scale_colour_viridis_c(option = "A", trans = "log10")) +
facet_wrap(~season) +
ggtitle("DOC mean by season")Few observations during winter at high latitudes. Will see what we can do.
Save DOC data
# Join annual and seasonnal observations together
df_doc <- bind_rows(df_doc_ann, df_doc_s) %>%
# and reorder columns
select(
lon, lat, season,
contains("surf"),
contains("epi"),
contains("meso"),
contains("bathy")
)
### Some sanity checks
## seasonal data is only in surf #ok
#foo %>% filter(season != 0) %>% select(-contains(c("epi", "meso", "bathy"))) %>% summary()
#foo %>% filter(season != 0) %>% select(contains(c("epi", "meso", "bathy"))) %>% summary()
#
## annual data is in all 4 layers
#foo %>% filter(season == 0) %>% summary()
#
## All ok!
# Save
save(df_doc, file = "data/01.doc_data.Rdata")